The business faces inventory build-up, customer churn, and high logistics costs, and urgently needs data-driven operational improvements. It holds a large volume of transaction data (400k+ orders) but lacks a systematic analysis framework, so decision-making is slow and margins keep eroding.
This project covers four core modules: sales, customers, inventory, and logistics. Through data mining, predictive modelling, and strategy optimisation, it aims to lift overall commercial efficiency.
import pandas as pd
import numpy as np
df = pd.read_excel('./数据/Online Retail.xlsx')
df.sample(9)
df.info()
df.isnull().sum()
df[['Quantity', 'UnitPrice']].describe()
df_clean = df.dropna(subset=['CustomerID'])
df_clean
len(df_clean)
df_clean.isna().sum()
# Filter out negative-Quantity records (returns are analysed separately)
# Note: a proper return analysis must join returns back to their original orders; only normal sales are kept here
df_clean = df_clean[df_clean['Quantity'] > 0]
# Filter abnormal prices (UnitPrice <= 0 or extreme highs)
df_clean = df_clean[(df_clean['UnitPrice'] > 0) & (df_clean['UnitPrice'] < 1000)]
# Post-cleaning summary of the numeric fields
print("\nPost-cleaning numeric summary:")
print(df_clean[['Quantity', 'UnitPrice']].describe())
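The fixed cap used above (UnitPrice < 1000) is a blunt instrument; a quantile-based alternative (a sketch only, not applied to df_clean here) adapts the cut-off to the observed price distribution:

# Alternative: data-driven price cap from a high quantile instead of a hard-coded 1000
price_cap = df_clean['UnitPrice'].quantile(0.999)  # keep all but the most extreme 0.1%
outliers = df_clean[df_clean['UnitPrice'] > price_cap]
print(f"Quantile-based cap: {price_cap:.2f} ({len(outliers)} rows above it)")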
# Flag return orders (InvoiceNo starting with 'C')
# Note: negative-quantity rows were already dropped above, so this flag now rarely fires;
# to analyse returns, build df_returns before the Quantity > 0 filter
df_clean['IsReturn'] = df_clean['InvoiceNo'].astype(str).str.startswith('C')
# Optional: keep return orders separately (for later analysis)
df_returns = df_clean[df_clean['IsReturn']]
# Keep normal sales only (when returns should be excluded)
df_sales = df_clean[~df_clean['IsReturn']]
# Drop fully duplicated rows
df_clean = df_clean.drop_duplicates()
# Convert InvoiceDate to datetime
df_clean['InvoiceDate'] = pd.to_datetime(df_clean['InvoiceDate'])
df_clean.sample(10)
# Save to a new CSV file
df_clean.to_csv('Online_Retail_Clean.csv', index=False)
print("\nCleaned data saved to 'Online_Retail_Clean.csv'")
%pip install duckdb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import duckdb
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
df = pd.read_csv(
'./Online_Retail_Clean.csv',
    parse_dates=['InvoiceDate'],  # force date parsing
    dtype={'InvoiceNo': str}      # keep InvoiceNo as a string
)
query_step1 = '''
SELECT
strftime(
        CAST(InvoiceDate AS TIMESTAMP),  -- explicit type cast
'%Y-%m'
) AS Month,
SUM(UnitPrice * Quantity) AS TotalSales
FROM df
WHERE
Quantity > 0
AND NOT (InvoiceNo LIKE 'C%')
GROUP BY Month
ORDER BY Month;
'''
df_step1 = duckdb.sql(query_step1).to_df()
df_step1
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
def plot_sales_trend(df, figsize=(12, 6), title='Monthly Sales Trend',
                     line_color='#2E86C1', marker='o', grid_alpha=0.3):
    """
    Plot a monthly sales line chart
    Args:
        df        : DataFrame with 'Month' (YYYY-MM) and 'TotalSales' columns
        figsize   : figure size, default (12, 6)
        title     : chart title
        line_color: line colour (HEX/RGB/named)
        marker    : data-point marker, default circle ('o'); None hides markers
        grid_alpha: grid-line transparency (0 transparent, 1 opaque)
    """
    # Preprocess (guarantee correct date ordering)
    df = df.copy()
    df['Date'] = pd.to_datetime(df['Month'])  # datetime type ensures proper sorting
    df.sort_values('Date', inplace=True)
    plt.figure(figsize=figsize)
    ax = plt.gca()
    # Draw the line
    sns.lineplot(
        data=df, x='Date', y='TotalSales',
        marker=marker, color=line_color,
        linewidth=2.5, markersize=8,
        markeredgecolor='white', markeredgewidth=1
    )
    # Data labels (rotated when months are many, to avoid overlap)
    for idx, row in df.iterrows():
        ax.text(
            row['Date'], row['TotalSales'] + 20000,  # nudge the label upwards
            f"{row['TotalSales']/1000:.1f}K",        # display in thousands
            ha='center', va='bottom',
            rotation=45 if len(df) > 12 else 0,
            fontsize=9, alpha=0.75
        )
    # Date-axis formatting
    locator = mdates.MonthLocator(interval=1)  # one tick per month
    ax.xaxis.set_major_locator(locator)
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    # Styling
    plt.title(title, fontsize=14, pad=20, fontweight='bold')
    plt.xlabel('Month', fontsize=12, labelpad=10)
    plt.ylabel('Sales', fontsize=12, labelpad=10)
    plt.grid(True, linestyle='--', alpha=grid_alpha)
    plt.xticks(rotation=45)  # rotate uniformly to avoid overlap
    # Leave headroom above the highest point
    ymin, ymax = ax.get_ylim()
    ax.set_ylim(ymin, ymax * 1.1)  # 10% headroom
    sns.despine()  # drop surplus spines
    plt.tight_layout()
    plt.show()
# Call the function (one step to a polished chart)
plot_sales_trend(
    df_step1,
    title='Monthly Online Retail Sales (Dec 2010 - Dec 2011)',
    line_color='#E67E22',  # orange
    marker='D',            # diamond markers
    grid_alpha=0.4
)
Peak month: November 2011 (most plausibly driven by pre-Christmas holiday demand and promotions, given this is a UK retailer).
Trough month: February 2011 (a seasonal post-holiday lull with fewer trading days).
Stable period: May-August 2011 (no major promotions; sales fluctuate little).
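These observations can be checked programmatically against df_step1 (a small sketch; it assumes the Month/TotalSales columns produced above, and note that the dataset's partial first and last months can dominate the minimum):

# Verify peak and trough months directly from the aggregated table
peak = df_step1.loc[df_step1['TotalSales'].idxmax()]
trough = df_step1.loc[df_step1['TotalSales'].idxmin()]
print(f"Peak month: {peak['Month']} ({peak['TotalSales']:,.0f})")
print(f"Trough month: {trough['Month']} ({trough['TotalSales']:,.0f})")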
# Top 10 by units sold. Note: duckdb.sql() on a multi-statement string only returns
# the LAST statement's result, so the two rankings are issued as separate queries.
query_step2_qty = """
SELECT
    StockCode,
    Description,
    SUM(Quantity) AS TotalQuantitySold,
    SUM(UnitPrice * Quantity) AS TotalRevenue
FROM df
WHERE Quantity > 0
GROUP BY StockCode, Description  -- must include every non-aggregated column
ORDER BY TotalQuantitySold DESC  -- sort by the alias
LIMIT 10;
"""
# Top 10 by revenue (same aggregation, different sort key)
query_step2_rev = """
SELECT
    StockCode,
    Description,
    SUM(Quantity) AS TotalQuantitySold,
    SUM(UnitPrice * Quantity) AS TotalRevenue
FROM df
WHERE Quantity > 0
GROUP BY StockCode, Description
ORDER BY TotalRevenue DESC
LIMIT 10;
"""
df_step2_qty = duckdb.sql(query_step2_qty).to_df()
df_step2_rev = duckdb.sql(query_step2_rev).to_df()
df_step2_qty
query_step3 = """
WITH CountrySales AS (
    SELECT
        Country,
        SUM(UnitPrice * Quantity) AS Sales,
        SUM(SUM(UnitPrice * Quantity)) OVER () AS GlobalSales  -- grand total via a window over the aggregate
    FROM df
    GROUP BY Country
)
SELECT
    Country,
    Sales,
    Sales * 100.0 / GlobalSales AS Percentage,
    SUM(Sales) OVER (ORDER BY Sales DESC) * 100.0 / GlobalSales AS CumulativePercentage
FROM CountrySales
ORDER BY Sales DESC;
"""
df_step3 = duckdb.sql(query_step3).to_df()
df_step3
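To make the concentration visible at a glance, a Pareto chart can be drawn from df_step3 (a sketch; it assumes the Sales and CumulativePercentage columns produced by the query above):

# Pareto chart: per-country sales bars plus the cumulative-share line
top = df_step3.head(10)  # omit the long tail of tiny markets for readability
fig, ax1 = plt.subplots(figsize=(10, 5))
ax1.bar(top['Country'], top['Sales'], color='#2E86C1', alpha=0.8)
ax1.set_ylabel('Sales')
ax1.tick_params(axis='x', rotation=45)
ax2 = ax1.twinx()
ax2.plot(top['Country'], top['CumulativePercentage'], color='#E67E22', marker='o')
ax2.axhline(80, linestyle='--', color='gray', alpha=0.6)  # classic 80% reference line
ax2.set_ylabel('Cumulative share (%)')
plt.title('Country Sales Pareto Chart')
plt.tight_layout()
plt.show()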
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
# Load the cleaned data (assumed filename 'Online_Retail_Clean.csv')
df = pd.read_csv('Online_Retail_Clean.csv', parse_dates=['InvoiceDate'])
print("数据前5行:")
df.head()
# Use the day after the latest transaction as the analysis date
current_date = df['InvoiceDate'].max() + pd.DateOffset(days=1)
# Compute each customer's RFM values
rfm = df.groupby('CustomerID').agg({
    'InvoiceDate': lambda x: (current_date - x.max()).days,         # Recency (days since last purchase)
    'InvoiceNo': 'nunique',                                         # Frequency (number of purchases)
    'UnitPrice': lambda x: (x * df.loc[x.index, 'Quantity']).sum()  # Monetary (total spend)
}).reset_index()
# Rename columns
rfm.columns = ['CustomerID', 'Recency', 'Frequency', 'Monetary']
print("\nRFM sample:")
rfm.head()
print("【检查Frequency阈值】")
print(rfm['Frequency'].value_counts().sort_index())
print("\n【观察Monetary分位点】")
print(rfm['Monetary'].quantile([0.25,0.5,0.75,1.0]))
# R score: quartile-based; a smaller Recency earns a higher score
rfm['R_Score'] = pd.qcut(rfm['Recency'], q=4, labels=range(4, 0, -1), duplicates='drop')
# Custom bin edges for Frequency (unit: number of purchases)
frequency_bins = [0, 1, 3, 8, np.inf]  # left-open, right-closed: (0,1], (1,3], (3,8], (8,inf]
frequency_labels = ['low', 'occasional', 'regular', 'high']  # for reference only; numeric labels are used below
rfm['F_Score'] = pd.cut(
    rfm['Frequency'],
    bins=frequency_bins,
    labels=range(1, 5),   # numeric labels so an overall RFM score can be computed later
    right=True,           # right-closed: e.g. (1,3] includes 3
    include_lowest=False  # first bin is (0,1], so 0 itself is excluded
)
# Log-transform Monetary first (compresses the extreme range)
rfm['Monetary_log'] = np.log10(rfm['Monetary'] + 1e-5)  # small offset so zero values can be logged
# Quantiles of the transformed distribution
quantiles = rfm['Monetary_log'].quantile([0.25, 0.5, 0.75, 1.0])
print("Quantiles after transformation:")
print(quantiles)
# Convert back to bin edges on the original currency scale (inverse transform)
actual_bins = [0] + [10**q for q in quantiles.iloc[:-1]] + [rfm['Monetary'].max()]
print("\nActual bin edges (raw Monetary values):")
print([round(b, 2) for b in actual_bins])
# Apply the binning
rfm['M_Score'] = pd.cut(
    rfm['Monetary'],
    bins=actual_bins,
    labels=range(1, 5),
    include_lowest=True  # include the minimum value in the first bin
)
rfm.head()
print("\n 验证分箱覆盖情况(Frequency):")
print(rfm['F_Score'].value_counts().sort_index())
print("\n 查看Monetary分箱边界有效性:")
print(rfm.groupby('M_Score')['Monetary'].agg(['min','max','count']).round(2))
# Segment assignment via condition filters (np.select takes the FIRST matching condition)
# qcut/cut return ordered categoricals whose order follows the label sequence,
# so cast the scores to int before the numeric comparisons below.
for col in ['R_Score', 'F_Score', 'M_Score']:
    rfm[col] = rfm[col].astype(int)
conditions = [
    (rfm['R_Score'] >= 4) & (rfm['F_Score'] == 4) & (rfm['M_Score'] == 4),                          # Top VIP
    ((rfm['F_Score'] <= 2) | (rfm['M_Score'] <= 2)) & (rfm['R_Score'] >= 4),                        # High potential
    (rfm['R_Score'] >= 3) & (rfm['M_Score'] >= 3) & (rfm['F_Score'] >= 3),                          # Loyal regular
    (rfm['R_Score'] <= 2) & (rfm['F_Score'] >= 3) & (rfm['M_Score'] >= 3),                          # Dormant
    (rfm['R_Score'] <= 2) & ((rfm['F_Score'] <= 2) | (rfm['M_Score'] <= 2)),                        # Churn risk
    (rfm['F_Score'] == 1) | ((rfm['F_Score'] > 1) & (rfm['M_Score'] == 1) & (rfm['R_Score'] <= 3))  # Long-tail low frequency
]
groups = ["Top VIP", "High potential", "Loyal regular", "Dormant", "Churn risk", "Long-tail low frequency"]
rfm['Segment'] = np.select(conditions, groups, default='Unclassified')
rfm.to_csv('rfm_segments.csv', index=False)  # index=False keeps the reload clean
# Count customers per segment
segmented = rfm['Segment'].value_counts().reset_index()
segmented.columns = ['Segment', 'Customers']
segmented['Share_%'] = (segmented['Customers'] / segmented['Customers'].sum() * 100).round(1)
segmented
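Headcount alone understates concentration; a quick revenue-share view per segment (a sketch reusing the rfm frame built above) makes the value distribution explicit:

# Revenue contribution per segment: share of total Monetary, not just headcount
value_share = (
    rfm.groupby('Segment')['Monetary']
    .sum()
    .sort_values(ascending=False)
    .to_frame('TotalMonetary')
)
value_share['Share_%'] = (value_share['TotalMonetary'] / value_share['TotalMonetary'].sum() * 100).round(1)
print(value_share)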
%pip install plotly
from matplotlib import cm
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
from wordcloud import WordCloud
import plotly.express as px  # Plotly for interactive charts
def visualize_customer_segments(df, fig_format='interactive', top_n=8):
    """
    Visual dashboard of the customer-segment distribution
    Args:
        df         : DataFrame with a 'Segment' column (one row per customer; aggregated internally)
        fig_format : output format ('interactive' | 'static')
        top_n      : show the top-N segments by size (the rest merge into 'Other segments')
    """
    # Preprocess: compute shares and merge the long tail
    grouped = df['Segment'].value_counts().reset_index()
    grouped.columns = ['Segment', 'Customers']
    grouped['Share'] = grouped['Customers'] / grouped['Customers'].sum()
    # Merge long-tail segments (improves readability)
    if top_n < len(grouped):
        main_groups = grouped.nlargest(top_n, 'Customers')
        others_count = grouped['Customers'].sum() - main_groups['Customers'].sum()
        others_row = pd.DataFrame([['Other segments', others_count, others_count/grouped['Customers'].sum()]],
                                  columns=grouped.columns)
        grouped = pd.concat([main_groups, others_row])
    # Colour scheme keyed to business semantics
    color_dict = {
        'Top VIP': '#FF6B6B',                  # highlight red
        'Loyal regular': '#4ECDC4',            # trustworthy teal
        'High potential': '#FFD93D',           # bright, hopeful yellow
        'Dormant': '#6C5B7B',                  # muted purple for dormancy
        'Churn risk': '#FF6B6B',               # warning red
        'Long-tail low frequency': '#A8D8EA',  # cool light blue
        'Other segments': '#D3D3D3'            # neutral grey
    }
    colors = [color_dict.get(g, '#808080') for g in grouped['Segment']]
    if fig_format == 'interactive':
        # Interactive sunburst chart (requires plotly: %pip install plotly)
        fig = px.sunburst(
            grouped,
            path=['Segment'],
            values='Customers',
            color='Segment',
            color_discrete_map=color_dict,
            title='Customer Segment Distribution (click to expand)',
            width=800, height=600
        )
        fig.show()
        return fig
    else:
        plt.figure(figsize=(15, 8), dpi=120)
        # Two-panel layout: main chart + word cloud
        plt.subplot(1, 2, 1)
        # Bubble chart (bubble area reflects segment size)
        bubble_sizes = np.sqrt(grouped['Customers']) * 30
        scatter = plt.scatter(
            x=range(len(grouped)),
            y=grouped['Customers'],
            s=bubble_sizes,
            c=colors,
            alpha=0.7,
            edgecolors='w'
        )
        plt.xticks(ticks=range(len(grouped)), labels=grouped['Segment'], rotation=45, ha='right')
        plt.ylabel('Customers')
        plt.title('Segment Size Bubble Chart', fontsize=14)
        plt.grid(alpha=0.2)
        # Second panel: word cloud of segment weights (requires wordcloud)
        plt.subplot(1, 2, 2)
        wordcloud = WordCloud(
            width=400,
            height=300,
            background_color='white',
            colormap='viridis'  # colour map for legibility
        ).generate_from_frequencies({k: v for k, v in zip(grouped['Segment'], grouped['Customers'])})
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Segment Weight Word Cloud', fontsize=14)
        plt.tight_layout()
        plt.show()
        return None
# Generate the interactive version
visualize_customer_segments(rfm, fig_format='interactive')
# First and last purchase date per customer
customer_lifecycle = df.groupby('CustomerID').agg({
'InvoiceDate': ['min', 'max']
}).reset_index()
customer_lifecycle.columns = ['CustomerID', 'FirstPurchase', 'LastPurchase']
# Lifetime in days
customer_lifecycle['Lifetime'] = (customer_lifecycle['LastPurchase'] - customer_lifecycle['FirstPurchase']).dt.days
print("\nCustomer lifecycle sample:")
customer_lifecycle.head()
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from datetime import datetime
df = pd.read_csv('Online_Retail_Clean.csv', parse_dates=['InvoiceDate'])
# Compute customer lifecycle (with tidy column names)
df['Amount'] = df['UnitPrice'] * df['Quantity']  # line-item revenue; summing UnitPrice alone would understate spend
customer_lifecycle = df.groupby('CustomerID').agg(
    FirstPurchase=('InvoiceDate', 'min'),
    LastPurchase=('InvoiceDate', 'max'),
    TotalPurchases=('InvoiceNo', 'nunique'),
    TotalSpent=('Amount', 'sum')
).reset_index()
# Lifetime in days (with outlier handling)
customer_lifecycle['LifetimeDays'] = (customer_lifecycle['LastPurchase'] -
                                      customer_lifecycle['FirstPurchase']).dt.days
# Guard against negatives for single-purchase customers
customer_lifecycle['LifetimeDays'] = customer_lifecycle['LifetimeDays'].clip(lower=0)
# Basic statistics
print("Basic analysis:")
print(f"Total customers: {len(customer_lifecycle)}")
print(f"Mean lifetime: {customer_lifecycle['LifetimeDays'].mean():.1f} days")
print(f"Max lifetime: {customer_lifecycle['LifetimeDays'].max()} days")
print(f"Median lifetime: {customer_lifecycle['LifetimeDays'].median()} days")
print(f"Share of single-purchase customers: {(customer_lifecycle['LifetimeDays'] == 0).mean()*100:.1f}%")
# Additional analysis dimensions
# 1. Customer tiers by lifetime
customer_lifecycle['CustomerSegment'] = pd.cut(
customer_lifecycle['LifetimeDays'],
bins=[-1, 0, 30, 90, 365, np.inf],
labels=['SinglePurchase', '<1M', '1-3M', '3M-1Y', '1Y+']
)
# 2. Average purchase interval
purchase_counts = df.groupby(['CustomerID', 'InvoiceDate']).size().reset_index()
purchase_intervals = purchase_counts.groupby('CustomerID')['InvoiceDate'].apply(lambda x: x.diff().mean())
customer_lifecycle = customer_lifecycle.merge(
purchase_intervals.rename('AvgPurchaseInterval'),
how='left',
on='CustomerID'
)
# Visual analysis
plt.figure(figsize=(15, 10))
plt.suptitle('Customer Lifecycle Analysis', y=1.02)
# 1. Lifetime distribution histogram
plt.subplot(2, 2, 1)
sns.histplot(data=customer_lifecycle[customer_lifecycle['LifetimeDays'] > 0],
x='LifetimeDays', bins=30, kde=True)
plt.title('Customer Lifetime Distribution')
plt.xlabel('Days Between First/Last Purchase')
plt.ylabel('Customer Count')
# 2. Tier distribution
plt.subplot(2, 2, 2)
segment_dist = customer_lifecycle['CustomerSegment'].value_counts(normalize=True).sort_index()
segment_dist.plot(kind='bar', color='teal')
plt.title('Customer Segment Distribution')
plt.ylabel('Percentage')
plt.xticks(rotation=45)
# 3. Lifetime vs total spend
plt.subplot(2, 2, 3)
sns.scatterplot(data=customer_lifecycle, x='LifetimeDays', y='TotalSpent', alpha=0.6)
plt.title('Lifetime vs Total Spending')
plt.yscale('log')  # log scale handles the long-tailed distribution
# 4. Purchase frequency vs lifetime
plt.subplot(2, 2, 4)
sns.boxplot(data=customer_lifecycle, x='CustomerSegment', y='TotalPurchases',
showfliers=False, palette='Blues')
plt.title('Purchase Frequency by Segment')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Further analysis
print("\nHigh-value customer discovery:")
high_value = customer_lifecycle[
(customer_lifecycle['CustomerSegment'].isin(['1Y+', '3M-1Y'])) &
(customer_lifecycle['TotalSpent'] > customer_lifecycle['TotalSpent'].quantile(0.75))
]
print(f"高价值客户数量:{len(high_value)}")
print(f"高价值客户平均消费金额:{high_value['TotalSpent'].mean():.2f}")
print(f"高价值客户占比:{len(high_value)/len(customer_lifecycle)*100:.1f}%")
# One order per customer per day: OrderID = date + CustomerID
df['OrderID'] = df['InvoiceDate'].dt.strftime('%Y%m%d') + '_' + df['CustomerID'].astype(str)
unique_orders = df[['CustomerID', 'OrderID']].drop_duplicates()  # key step: count each customer once per day
unique_orders['InvoiceDate'] = df['InvoiceDate']  # index-aligned assignment keeps each row's own timestamp
unique_orders
def calculate_repeat_rate(orders_df, time_unit='M'):
    """Compute the global repeat-purchase rate plus its per-period trend
    Args:
        orders_df : DataFrame with ['CustomerID', 'OrderID', 'InvoiceDate']
        time_unit : time unit for the trend ('D' day, 'W' week, 'M' month, 'Q' quarter)
    Returns:
        global_rate: overall repeat rate (share of customers with more than one order)
        trend_df   : per-period repeat rate (with total-customer, repeat-customer, and rate columns)
    """
    # Global repeat rate
    customer_stats = orders_df.groupby('CustomerID')['OrderID'].nunique().reset_index()
    customer_stats.columns = ['CustomerID', 'TotalOrders']
    repeat_customers = customer_stats[customer_stats['TotalOrders'] > 1]['CustomerID'].nunique()
    total_customers = customer_stats['CustomerID'].nunique()
    global_rate = round(repeat_customers / total_customers, 4)
    # Per-period repeat rate
    orders_df['Period'] = orders_df['InvoiceDate'].dt.to_period(time_unit)
    period_stats = orders_df.groupby(['Period', 'CustomerID']).size().reset_index()
    period_stats.columns = ['Period', 'CustomerID', 'OrdersInPeriod']
    # Flag whether a customer repeated within the period (>= 2 orders in the period)
    period_stats['RepeatFlag'] = (period_stats['OrdersInPeriod'] >= 2).astype(int)
    trend_df = period_stats.groupby('Period').agg(
        TotalCustomers=('CustomerID', 'nunique'),
        RepeatCustomers=('RepeatFlag', 'sum')
    ).reset_index()
    trend_df['RepeatRate'] = trend_df['RepeatCustomers'] / trend_df['TotalCustomers']
    trend_df['Period'] = trend_df['Period'].astype(str)
    return global_rate, trend_df
# Example call
global_repeat_rate, monthly_trend = calculate_repeat_rate(unique_orders)
print(f"Global repeat-purchase rate: {global_repeat_rate*100:.2f}%")
monthly_trend
import matplotlib.pyplot as plt
import seaborn as sns
# Style settings
plt.style.use('ggplot')
sns.set_palette("pastel")
def plot_repeat_trend(trend_df, title='Monthly Repeat-Purchase Rate Trend'):
    """Plot the repeat-rate trend (with data labels)"""
    plt.figure(figsize=(12, 6))
    ax = sns.lineplot(data=trend_df, x='Period', y='RepeatRate',
                      marker='o', linewidth=2.5, markersize=8)
    # Data labels and formatting
    for idx, row in trend_df.iterrows():
        ax.text(row['Period'], row['RepeatRate'] + 0.01,
                f"{row['RepeatRate']*100:.1f}%",
                ha='center', va='bottom', fontsize=9)
    # Axis polish
    plt.title(title, fontsize=14, pad=20)
    plt.xlabel('Period', fontsize=12)
    plt.ylabel('Repeat rate', fontsize=12)
    plt.ylim(0, trend_df['RepeatRate'].max() * 1.2)
    plt.xticks(rotation=45)
    plt.grid(alpha=0.3)
    sns.despine()
    plt.tight_layout()
    plt.show()
# Example call (adapts to monthly/quarterly periods automatically)
plot_repeat_trend(monthly_trend)
The repeat-purchase rate slides steadily from its high point, and some months (e.g. the outlier month at just 11.7%) sit well below average. Gains during promotions are short-lived and do not convert into lasting retention, which points to systematic weaknesses in the retention mechanism.
Recommendations: split big promotions into a warm-up phase (seeding content) → peak phase (limited-time discounts) → encore phase (member-only comebacks) to stretch the engagement window. Design cross-category spend-and-save offers (e.g. "beauty + home: 50 off 599") to push customers into new categories, and pair gift-with-purchase deals with promising new products (e.g. "buy 2 sunscreens, get a sample") to lower the trial barrier. Embed membership sign-up in promotion pages (e.g. "register for an extra 10% off") and funnel shoppers to social channels during campaigns (e.g. scan to add the company WeChat for after-sales coupons).
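To verify whether retention genuinely decays rather than being an artefact of the monthly snapshot, a small cohort-retention sketch can be built from unique_orders (assumed to exist as created above; month granularity):

# Cohort retention: group customers by first-order month, then track the share
# still ordering in each subsequent month.
orders = unique_orders.copy()
orders['OrderMonth'] = orders['InvoiceDate'].dt.to_period('M')
orders['CohortMonth'] = orders.groupby('CustomerID')['OrderMonth'].transform('min')
orders['MonthIndex'] = (orders['OrderMonth'] - orders['CohortMonth']).apply(lambda d: d.n)
cohort_counts = (
    orders.groupby(['CohortMonth', 'MonthIndex'])['CustomerID']
    .nunique()
    .unstack(fill_value=0)
)
retention = cohort_counts.divide(cohort_counts[0], axis=0).round(3)  # column 0 holds each cohort's size
print(retention.head())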
# First and second purchase dates per customer
first_purchase = (
    unique_orders
    .groupby('CustomerID', as_index=False)['InvoiceDate']
    .min()  # first purchase date
    .rename(columns={'InvoiceDate': 'InvoiceDate_First'})  # rename up front
)
second_purchase = (
    unique_orders
    .sort_values('InvoiceDate')  # ensure chronological order within each group
    .groupby('CustomerID')
    .apply(lambda x: x.iloc[1]['InvoiceDate'] if len(x) >= 2 else pd.NaT)  # second purchase date
    .reset_index()
    .rename(columns={0: 'InvoiceDate_Second'})  # name the result column directly
    .dropna()  # keep only customers with a second purchase
)
# Merge and compute the day gap
purchase_intervals = first_purchase.merge(second_purchase, on='CustomerID', suffixes=('_First', '_Second'))
purchase_intervals['DaysToRepeat'] = (purchase_intervals['InvoiceDate_Second'] - purchase_intervals['InvoiceDate_First']).dt.days
# Distribution of repurchase intervals
print("\nRepurchase interval distribution (days):")
print(purchase_intervals['DaysToRepeat'].describe(percentiles=[0.25, 0.5, 0.75, 0.95]))
# Histogram of repurchase intervals
plt.figure(figsize=(10, 5))
sns.histplot(data=purchase_intervals, x='DaysToRepeat', bins=30, kde=True)
plt.title('Distribution of Time to First Repurchase', fontsize=12)
plt.xlabel('Days')
plt.ylabel('Customers')
plt.show()
Concentration: the peak sits at 0-80 days, so most customers repurchase within 1-3 months.
Long tail: a minority repurchase far beyond the average cycle (200-350 days) and need targeted intervention.
Targeted recommendations (a tagging sketch follows this list):
Short-cycle customers (repurchase <= 50 days): offer subscription auto-replenishment for high-frequency goods (e.g. daily necessities) to lock in demand, plus a tiered points scheme where shorter gaps earn bigger rewards (e.g. a gift after 3 purchases). Example: customer A repurchases within 20 days and receives double points or a limited-time coupon.
Mid-cycle customers (50-250 days): send personalised recommendations 30 and 60 days after the first purchase (based on history) and bundle complementary items (e.g. shampoo + conditioner) to shorten the cycle. Example: customer B buys a coffee machine and gets a coffee-bean coupon 30 days later.
Long-cycle customers (repurchase >= 250 days): flag customers inactive for 100+ days and run win-back campaigns ("we miss you" messages plus exclusive discounts); offer high-value incentives (e.g. 30 off 100) to those inactive for 200+ days.
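To operationalise the three bands, customers in purchase_intervals can be tagged directly (a sketch; the 50/250-day cut-offs mirror the bands above):

# Tag each repeat customer with an intervention band based on days-to-repeat
purchase_intervals['Band'] = pd.cut(
    purchase_intervals['DaysToRepeat'],
    bins=[-1, 50, 250, np.inf],
    labels=['short-cycle (<=50d)', 'mid-cycle (50-250d)', 'long-cycle (>250d)']
)
print(purchase_intervals['Band'].value_counts())
# Each band can then feed the matching campaign, e.g. export the win-back list:
# purchase_intervals[purchase_intervals['Band'] == 'long-cycle (>250d)'].to_csv('winback_list.csv', index=False)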
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
# ==== Data preprocessing ====
# Assumed path to the cleaned file (adjust as needed)
df = pd.read_csv('Online_Retail_Clean.csv', parse_dates=['InvoiceDate'])
df = df[~df['InvoiceNo'].astype(str).str.startswith('C')]  # drop return orders (keep normal transactions)
# ==== Multi-level anomaly-detection rules ====
def detect_anomalies(df):
    """Anomaly tagger: returns the DataFrame with anomaly flags attached"""
anomalies = df.copy()
    # Rule 1: amount anomalies (unit price or order value beyond thresholds)
    price_upper = anomalies['UnitPrice'].quantile(0.999)  # exclude the top 0.1% of unit prices
    anomalies['Amount'] = anomalies['UnitPrice'] * anomalies['Quantity']
    anomalies['Price_Anomaly'] = np.where(
        (anomalies['UnitPrice'] <= 0) |
        (anomalies['UnitPrice'] > price_upper) |
        (anomalies['Amount'] > 1e5),  # single order worth over 100k
        1, 0
    )
    # Rule 2: bulk repeat buying (same customer buying the same product unusually often)
    dup_orders = anomalies.groupby(['CustomerID', 'StockCode']).size().reset_index(name='Count')
    duplicates = dup_orders[dup_orders['Count'] > 5]  # more than 5 purchases of one product is treated as suspicious
    anomalies = anomalies.merge(
        duplicates, on=['CustomerID', 'StockCode'], how='left', indicator='Dup_Anomaly'
    )
    anomalies['Dup_Anomaly'] = np.where(anomalies['Dup_Anomaly'] == 'both', 1, 0)
    # Rule 3: dense ordering within a short window (same customer)
    # Caveat: rows are invoice LINE ITEMS, so items on one invoice share a timestamp and
    # inflate this flag; a stricter check should diff de-duplicated invoices (see below)
    anomalies.sort_values(['CustomerID', 'InvoiceDate'], inplace=True)
    anomalies['TimeDiff'] = anomalies.groupby('CustomerID')['InvoiceDate'].diff().dt.total_seconds() / 60  # gap in minutes
    anomalies['Time_Anomaly'] = np.where(
        anomalies['TimeDiff'] < 5, 1, 0  # consecutive orders within 5 minutes
    )
    # Rule 4: off-hours orders (assumed business hours 9:00-22:00)
    anomalies['Hour'] = anomalies['InvoiceDate'].dt.hour
    anomalies['OffHours_Anomaly'] = np.where(
        (anomalies['Hour'] < 9) | (anomalies['Hour'] >= 22), 1, 0
    )
    # Combined flag (1 if any rule fires)
    anomaly_cols = ['Price_Anomaly', 'Dup_Anomaly', 'Time_Anomaly', 'OffHours_Anomaly']
    anomalies['Is_Anomaly'] = anomalies[anomaly_cols].max(axis=1)
return anomalies
# Run the detection
anomaly_df = detect_anomalies(df)
# ==== Distribution of anomaly types ====
anomaly_stats = pd.DataFrame({
    'AnomalyType': ['Price anomaly', 'Repeat-purchase anomaly', 'Short-interval high frequency',
                    'Off-hours order', 'Total anomalous orders'],
    'Orders': [
        anomaly_df['Price_Anomaly'].sum(),
        anomaly_df['Dup_Anomaly'].sum(),
        anomaly_df['Time_Anomaly'].sum(),
        anomaly_df['OffHours_Anomaly'].sum(),
        anomaly_df['Is_Anomaly'].sum()
    ]
})
anomaly_stats['Share(%)'] = (anomaly_stats['Orders'] / len(anomaly_df) * 100).round(2)
print("\nAnomalous order overview:")
anomaly_stats
Short-interval high-frequency transactions (95.57%): taken at face value this reads as bots or scalpers flooding the system; in practice the figure is heavily inflated because the flag is computed per line item, so items on one invoice share a single timestamp (see the invoice-level re-check below). Whatever survives that re-check points to risk controls lacking basic defences (human verification, IP limits).
Repeat-purchase anomalies (10.71%): the same users repeatedly farm promotions, crowding out genuine customers; the campaign rules have obvious holes (unlimited participation, no per-user caps).
Off-hours orders (2.31%): the system is exploitable outside business hours; there is no off-hours gating, or the timezone logic is wrong.
Price anomalies (0.10%): price management is sloppy, with basic mistakes recurring (misplaced decimals, promotion prices not synced) and no effective review process; both tech and operations share the blame.
Short-interval high frequency: plug the hole immediately with human verification (e.g. Google reCAPTCHA) to block non-human traffic, and throttle by IP/device ID (e.g. at most 3 requests per second per IP and 50 orders per day).
Repeat-purchase anomalies: patch the rules precisely: limit each user ID/phone/payment account to one purchase per campaign; require real-name verification (and face recognition for high-risk items such as iPhones); and retroactively clean historical orders to reclaim abused resources (cancel orders, revoke coupons).
Off-hours orders: enforce business hours (e.g. 9:00-22:00) on the server side and auto-reject orders outside them; show a "system maintenance" page and close the order entry off-hours.
Price anomalies: strict process control: every price change passes "operations submit → finance review → phased technical rollout", with full regression tests on boundary cases (0, negative, overly long decimals) before release.
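Before acting on the 95.57% figure, it is worth re-checking rule 3 at invoice level (a sketch using the anomaly_df produced above): de-duplicating line items per invoice removes the same-timestamp inflation.

# Re-check rule 3 at invoice level: one timestamp per (CustomerID, InvoiceNo),
# so line items on the same invoice no longer count as separate "orders".
invoice_times = (
    anomaly_df.drop_duplicates(subset=['CustomerID', 'InvoiceNo'])
    .sort_values(['CustomerID', 'InvoiceDate'])
)
invoice_times['InvTimeDiff'] = (
    invoice_times.groupby('CustomerID')['InvoiceDate'].diff().dt.total_seconds() / 60
)
invoice_level_rate = (invoice_times['InvTimeDiff'] < 5).mean()
print(f"Share of invoices placed within 5 minutes of the previous one: {invoice_level_rate*100:.2f}%")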
# ==== Visual analysis ====
plt.figure(figsize=(14, 8))
# Panel 1: anomaly counts by type
plt.subplot(2, 2, 1)
sns.barplot(x='AnomalyType', y='Orders', data=anomaly_stats[:-1])  # exclude the total row to avoid double counting
plt.title('Anomalous Orders by Type')
plt.xticks(rotation=30)
# Panel 2: unit-price distribution (sanity check)
plt.subplot(2, 2, 2)
sns.boxplot(x=anomaly_df['UnitPrice'], showfliers=False)  # hide extreme values
plt.title('Unit-Price Box Plot (outliers hidden)')
plt.xlabel('Unit price')
# Panel 3: off-hours orders by hour
plt.subplot(2, 2, 3)
hour_counts = anomaly_df[anomaly_df['OffHours_Anomaly'] == 1]['Hour'].value_counts().sort_index()
sns.lineplot(x=hour_counts.index, y=hour_counts.values, marker='o')
plt.title('Off-Hours Anomalous Orders by Hour')
plt.xticks(range(0, 24))
plt.grid(alpha=0.3)
# Panel 4: distribution of short time gaps (flagged rows only)
plt.subplot(2, 2, 4)
sns.histplot(anomaly_df[anomaly_df['Time_Anomaly'] == 1]['TimeDiff'], bins=20, kde=True)
plt.title('Short-Interval Transactions (<5 minutes)')
plt.xlabel('Gap (minutes)')
plt.tight_layout()
plt.show()
# ==== Export severe anomalies for manual review ====
critical_anomalies = anomaly_df[
    (anomaly_df['Is_Anomaly'] == 1) &
    (((anomaly_df['Price_Anomaly'] == 1) & (anomaly_df['Amount'] > 1e4)) |
     (anomaly_df['Dup_Anomaly'] == 1))
].sort_values('Amount', ascending=False)  # highest amounts first
print("\nTop 10 severe anomalies needing manual review:")
critical_anomalies[['InvoiceNo', 'CustomerID', 'StockCode', 'Amount', 'InvoiceDate']].head(10)
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('Online_Retail_Clean.csv', encoding='ISO-8859-1')
print("数据前3行:")
df.head(3)
%pip install mlxtend
# Group by InvoiceNo: merge each order's products into a list
transactions = (
    df.groupby('InvoiceNo')['Description']
    .apply(lambda x: list(x.unique()))  # de-duplicate items within an order
    .reset_index(name='Items')
)
print("\nTransaction sample:")
transactions.head()
# One-hot encode the item lists into a boolean matrix
te = TransactionEncoder()
te_ary = te.fit(transactions['Items']).transform(transactions['Items'])
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)
print("\nEncoded sample (item matrix):")
df_encoded.iloc[:3, :5]  # show the first 3 rows and 5 columns
# Minimum support (example: itemsets with support >= 1%)
frequent_itemsets = apriori(
    df_encoded,
    min_support=0.01,  # adjustable threshold
    use_colnames=True
)
print("\nFrequent itemset sample:")
frequent_itemsets.sort_values('support', ascending=False).head(10)
High-frequency item characteristics (the split below separates the two patterns):
Single-item frequency: JUMBO BAG RED RETROSPOT (support 8.64%) and LUNCH BAG RED RETROSPOT (support 6.96%) sell strongly on their own.
Combination frequency: REGENCY CAKESTAND 3 TIER (support 9.20%) likely appears in fixed pairings (e.g. tea-set purchases).
Business implications:
Inventory: keep the high-frequency single items well stocked.
Promotion focus: build traffic-driving promotions around high-frequency items (e.g. WHITE HANGING HEART T-LIGHT HOLDER).
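The single-item vs combination distinction above can be read straight off frequent_itemsets by splitting on itemset length (a sketch using the frame produced above):

# Split frequent itemsets into single items and multi-item combinations
fi = frequent_itemsets.copy()
fi['length'] = fi['itemsets'].apply(len)
print("Top single items by support:")
print(fi[fi['length'] == 1].nlargest(5, 'support'))
print("\nTop multi-item combinations by support:")
print(fi[fi['length'] >= 2].nlargest(5, 'support'))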
# Extract association rules (filtered by confidence and lift)
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1.2  # keep rules with lift > 1.2 only
)
# Sort by confidence, descending
rules = rules.sort_values('confidence', ascending=False)
print("\nAssociation rule sample:")
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(10)
Same-series products: e.g. REGENCY TEA PLATE PINK → REGENCY TEA PLATE GREEN (confidence 90.18%, lift 61.85): customers tend to buy the same series in different colours.
Occasion-based pairings: the strong link between the cake stand (REGENCY CAKESTAND 3 TIER) and teacups (GREEN REGENCY TEACUP) reflects afternoon-tea demand.
Business implications:
Cross-selling: bundle or recommend strongly associated items (e.g. the REGENCY TEA PLATE colour variants).
Page optimisation: surface high-confidence associations on product detail pages (e.g. recommend kitchen decor after a bedroom-decor purchase).
rules.to_csv('association_rules.csv', index=False)
# Top 20 frequent itemsets
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
top_itemsets = frequent_itemsets.nlargest(20, 'support').copy()
top_itemsets['itemsets_str'] = top_itemsets['itemsets'].apply(lambda x: ', '.join(list(x)))
plt.figure(figsize=(12, 6))
sns.barplot(x='support', y='itemsets_str', data=top_itemsets)
plt.title("Top 20 Frequent Itemsets (by support)")
plt.xlabel("Support")
plt.ylabel("Itemset")
plt.show()
Recommendation: bundle to capture the occasion's full value: package T-LIGHT HOLDER + CAKESTAND + PARTY BUNTING as an "ultimate party set" anchored at 70% of the summed individual prices to lift average order value.
Items with support < 0.03 (e.g. LUNCH BAG SUKI DESIGN): delist immediately, stop production, and free up the inventory capital.
# Keep high-confidence, high-lift rules
strong_rules = rules[(rules['confidence'] > 0.5) & (rules['lift'] > 2)]
plt.figure(figsize=(10, 6))
sns.scatterplot(x='support', y='confidence', size='lift', data=strong_rules, hue='lift', palette='viridis')
plt.title("Strong Association Rules (support vs confidence)")
plt.xlabel("Support")
plt.ylabel("Confidence")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
def get_recommendations(target_product, rules_df, top_n=5):
    """
    Build a recommendation list from the association rules
    """
recommendations = (
rules_df[rules_df['antecedents'].apply(lambda x: target_product in x)]
.sort_values('lift', ascending=False)
.head(top_n)
)
return recommendations[['antecedents', 'consequents', 'lift']]
# 示例:为 "CHRISTMAS TREE DECORATION" 生成推荐
target_item = "CHRISTMAS TREE DECORATION"
recommendations = get_recommendations(target_item, rules)
print(f"\n与 '{target_item}' 强关联的商品推荐:")
recommendations
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
# Load the cleaned data and the RFM segmentation results
df = pd.read_csv('Online_Retail_Clean.csv', encoding='ISO-8859-1', parse_dates=['InvoiceDate'])
rfm_segments = pd.read_csv('rfm_segments.csv')  # RFM segments saved earlier
# Christmas-season orders (December 2011)
df_christmas = df[
    (df['InvoiceDate'] >= '2011-12-01') &
    (df['InvoiceDate'] < '2012-01-01')  # half-open interval keeps all of 31 December
]
# Christmas-season transactions
transactions_christmas = (
    df_christmas.groupby('InvoiceNo')['Description']
    .apply(lambda x: list(x.unique()))
    .reset_index(name='Items')
)
# Encode and mine frequent itemsets
te = TransactionEncoder()
te_ary = te.fit_transform(transactions_christmas['Items'])
df_encoded_christmas = pd.DataFrame(te_ary, columns=te.columns_)
# Christmas-season association rules
frequent_itemsets_christmas = apriori(
    df_encoded_christmas,
    min_support=0.02,  # lower support threshold to suit the sparser holiday data
    use_colnames=True
)
rules_christmas = association_rules(
frequent_itemsets_christmas,
metric="lift",
min_threshold=2.0
).sort_values('confidence', ascending=False)
print("圣诞季强关联规则:")
rules_christmas[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head()
Customers buying several hand warmers (e.g. RED LOVE HEART + BIRD DESIGN → OWL DESIGN) show 90% confidence but a lift of only 10.6, suggesting Christmas purchases are highly concentrated yet only loosely associated, most likely driven by the gifting occasion.
Underlying problem: holiday marketing is short-sighted; sales lean on promotions without building long-term brand awareness, so repeat purchases may fall off a cliff once promotions end.
# Merge in the RFM segments
df = pd.merge(df, rfm_segments[['CustomerID', 'Segment']], on='CustomerID')
# Orders from top-value customers
df_high_value = df[df['Segment'] == 'Top VIP']
# High-value customers' transactions
transactions_high_value = (
    df_high_value.groupby('InvoiceNo')['Description']
    .apply(lambda x: list(x.unique()))
    .reset_index(name='Items')
)
# Encode and mine frequent itemsets
te_ary_high = te.fit_transform(transactions_high_value['Items'])
df_encoded_high = pd.DataFrame(te_ary_high, columns=te.columns_)
# High-value customers' association rules
frequent_itemsets_high = apriori(
    df_encoded_high,
    min_support=0.01,  # support threshold tuned to this segment's density
    use_colnames=True
)
rules_high = association_rules(
    frequent_itemsets_high,
    metric="lift",
    min_threshold=3.0  # higher lift threshold to focus on strong associations
).sort_values('confidence', ascending=False)
print("\nHigh-value customers' strong association rules:")
rules_high[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head()
Tea series: buying the same series in different colours/styles (REGENCY TEA PLATE PINK → GREEN → ROSES) reaches 93% confidence with lift above 40, showing high-value customers' strong attachment to series products and matching sets. Customers favour items sharing one design language (e.g. SMALL CHOCOLATES PINK BOWL → ORANGE BOWL), exposing brand loyalty as the driver of repeat purchases.
Underlying problem: the product line is fragmented; if customers must place multiple orders to complete a series, the assortment design has failed to offer convenient bundles, wasting customer trust and wallet share.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# Comparison helper
def compare_rules(rules1, rules2, label1, label2):
    """Compare the confidence and lift distributions of two rule sets"""
    rules1, rules2 = rules1.copy(), rules2.copy()  # avoid mutating the inputs
    rules1['Group'] = label1
    rules2['Group'] = label2
    combined = pd.concat([rules1, rules2], ignore_index=True)
    plt.figure(figsize=(12, 6))
    sns.boxplot(x='Group', y='confidence', data=combined)
    plt.title("Confidence Distribution")
    plt.show()
    plt.figure(figsize=(12, 6))
    sns.boxplot(x='Group', y='lift', data=combined)
    plt.title("Lift Distribution")
    plt.show()
# Run the comparison
compare_rules(rules_christmas, rules_high, 'Christmas season', 'Top VIP')
Heavy dependence on VIP customers: their behaviour is stable (easy to predict), but revenue leans on this small group, which is high-risk. Christmas promotion effects are unstable, and behaviour varies widely during campaigns, suggesting the marketing is not precise enough.
Core risk: the customer structure is unhealthy and new-customer growth is insufficient; if VIPs churn, the business could collapse.
In short: VIPs contribute most revenue but the base is narrow, and holiday promotion effects fluctuate, so new-customer acquisition needs optimising to reduce risk.
# Save the per-scenario rules
rules_christmas.to_csv('christmas_rules.csv', index=False)
rules_high.to_csv('high_value_rules.csv', index=False)
print("\nScenario rules saved to CSV!")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# ==== Data preprocessing ====
df = pd.read_csv('Online_Retail_Clean.csv', parse_dates=['InvoiceDate'])
# Keep valid sales records only
sales = df[df['Quantity'] > 0].copy()
# ==== Per-product core metrics ====
# "As of" date = day after the last transaction; pd.to_datetime('today') would make
# every product in this 2010-2011 dataset look stale
current_date = sales['InvoiceDate'].max() + pd.Timedelta(days=1)
stock_stats = sales.groupby('StockCode').agg(
    Total_Sold=('Quantity', 'sum'),          # total units
    First_Sale=('InvoiceDate', 'min'),       # first sale date
    Last_Sale=('InvoiceDate', 'max'),        # last sale date
    Active_Days=('InvoiceDate', 'nunique'),  # days with at least one sale
    Order_Count=('InvoiceNo', 'nunique')     # total orders
).reset_index()
# Derived fields
stock_stats['Sales_Period'] = (stock_stats['Last_Sale'] - stock_stats['First_Sale']).dt.days + 1
stock_stats['Days_Since_Last'] = (current_date - stock_stats['Last_Sale']).dt.days
stock_stats['Avg_Sales_Per_Day'] = stock_stats['Total_Sold'] / stock_stats['Active_Days']  # mean units per active day
stock_stats['Turnover_Rate'] = stock_stats['Total_Sold'] / stock_stats['Sales_Period']     # rough turnover proxy
# Guard against infinities (kept as a safeguard; none are expected here)
stock_stats.replace([np.inf, -np.inf], np.nan, inplace=True)
stock_stats = stock_stats[stock_stats['Sales_Period'] > 0]  # defensive filter; Sales_Period is always >= 1
# ==== Multi-condition slow-mover flag ====
def flag_slow_movers(df,
                     qty_threshold=50,        # possible slow mover if total units fall below this
                     last_sale_threshold=90,  # slow mover if unsold for more than n days
                     turnover_threshold=0.1   # slow mover if turnover (units / selling-span days) falls below this
                     ):
    """Flag slow-moving products on combined conditions"""
    conditions = (
        (df['Total_Sold'] <= qty_threshold) |
        (df['Days_Since_Last'] >= last_sale_threshold) |
        (df['Turnover_Rate'] <= turnover_threshold)
    )
    df['Is_Slow_Mover'] = np.where(conditions, 1, 0)
    return df
# Apply the flag (thresholds adjustable)
stock_stats = flag_slow_movers(stock_stats)
# ==== Results and visualisation ====
print("Slow-mover overview (flagged items first):")
stock_stats.sort_values('Is_Slow_Mover', ascending=False).head()
# Feature distributions for slow movers
sns.set_theme(style="whitegrid")
plt.figure(figsize=(15, 10))
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# Panel 1: total units, slow movers vs normal items
plt.subplot(2, 2, 1)
sns.boxplot(x='Is_Slow_Mover', y='Total_Sold', data=stock_stats, showfliers=False)
plt.title('Total Units: Slow Movers vs Normal Items')
# Panel 2: days since last sale for slow movers
plt.subplot(2, 2, 2)
sns.histplot(stock_stats.query('Is_Slow_Mover == 1')['Days_Since_Last'], bins=30, kde=True)
plt.title('Slow Movers: Days Since Last Sale')
plt.show()
Slow movers sell almost nothing while a small set of normal items dominates: an extreme 80/20 skew in which an oversized long tail drags down overall efficiency (quantified in the sketch below).
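The skew can be quantified in a few lines (a sketch over the stock_stats frame built above):

# What share of units comes from the top 20% of SKUs?
sku_sales = stock_stats.sort_values('Total_Sold', ascending=False)
top20pct_n = max(1, int(len(sku_sales) * 0.2))
top_share = sku_sales['Total_Sold'].head(top20pct_n).sum() / sku_sales['Total_Sold'].sum()
print(f"Top 20% of SKUs account for {top_share*100:.1f}% of units sold")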
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# Panel 3: the 50 products with the lowest turnover
plt.figure(figsize=(10, 8))
slow_turnover = stock_stats.nsmallest(50, 'Turnover_Rate')
sns.barplot(x='Turnover_Rate', y='StockCode', data=slow_turnover, orient='h')
plt.title('Bottom 50 Products by Turnover Rate')
plt.show()
Products such as StockCode 90199B and 47369B have turnover rates near zero: they have barely sold since entering the warehouse. Every extra day in storage eats margin and strains cash flow.
# Pie chart: share of each slow-mover trigger
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.figure(figsize=(9, 6))
reason_counts = pd.Series({
    'Total sales too low': len(stock_stats[stock_stats['Total_Sold'] <= 50]),
    'No sales for a long time': len(stock_stats[stock_stats['Days_Since_Last'] >= 90]),
    'Turnover too low': len(stock_stats[stock_stats['Turnover_Rate'] <= 0.1])
}, name='Slow-mover trigger')
# Note: the triggers overlap, so the shares describe trigger counts rather than disjoint products
reason_counts.plot.pie(autopct='%1.1f%%', startangle=90)
plt.ylabel('')
plt.tight_layout()
plt.show()
No sales for a long time (75.9%): stock piles up while dormant SKUs occupy the warehouse and bury cash flow; product selection failed and purchasing ordered with eyes closed.
Total sales too low (17.1%): demand forecasting missed badly and product design is detached from user needs.
Turnover too low (7.1%): inventory management is weak, with purchase volumes far above actual demand (the three triggers overlap; see the check below).
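Because one product can trip several triggers at once, the pie shares describe trigger counts rather than disjoint products; a quick overlap check (a sketch over stock_stats) makes that explicit:

# How many products fire 0/1/2/3 of the slow-mover triggers?
triggers = pd.DataFrame({
    'low_sales': stock_stats['Total_Sold'] <= 50,
    'stale': stock_stats['Days_Since_Last'] >= 90,
    'low_turnover': stock_stats['Turnover_Rate'] <= 0.1
})
print(triggers.sum(axis=1).value_counts().sort_index())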
# ==== Export an operations-ready table ====
slow_movers_to_check = stock_stats[stock_stats['Is_Slow_Mover'] == 1].sort_values(
    by=['Days_Since_Last', 'Total_Sold'], ascending=[False, True]
)
slow_movers_to_check.to_csv('slow_moving_products.csv', index=False)
%pip install prophet
import pandas as pd
from prophet import Prophet  # note: the package is now 'prophet' (formerly fbprophet)
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# ==== STEP 1. Data preprocessing ====
df = pd.read_csv('Online_Retail_Clean.csv', parse_dates=['InvoiceDate'])
daily_sales = (
    df.groupby(['StockCode', pd.Grouper(key='InvoiceDate', freq='D')])['Quantity']
    .sum()
    .reset_index()
)
# Rename to Prophet's expected format [ds, y] plus a product key [unique_id]
daily_sales = daily_sales.rename(columns={
    'InvoiceDate': 'ds',
    'Quantity': 'y',
    'StockCode': 'unique_id'
})
# ==== STEP 2. Model training and prediction ====
def forecast_product_demand(series, periods=30):
    """Forecast one product's future demand with Prophet"""
    model = Prophet(
        yearly_seasonality=True,
        weekly_seasonality=True,
        daily_seasonality=False,
        changepoint_prior_scale=0.05  # less sensitive to changepoints
    )
    model.fit(series)
    future = model.make_future_dataframe(periods=periods)  # 30-day horizon
    forecast = model.predict(future)
    return forecast[['ds', 'yhat']].rename(columns={'yhat': 'forecast'})
# ==== STEP 3. Forecast demand for the top-N products ====
# Step 3.1 Select high-volume products (avoids fitting every SKU)
top_products = daily_sales.groupby('unique_id')['y'].sum().nlargest(50).index.tolist()  # top 50 by volume
forecast_results = []
for product in top_products:
    product_series = daily_sales[daily_sales['unique_id'] == product]
    if len(product_series) < 14:  # skip products with under two weeks of history
        continue
    # Fit and forecast
    forecast_df = forecast_product_demand(product_series)
    forecast_df['unique_id'] = product
    # Historical mean daily volume as the baseline
    hist_avg = product_series['y'].mean()
    forecast_df['forecast'] = forecast_df['forecast'].apply(lambda x: max(0, x))  # clamp to non-negative
    forecast_df['forecast_change'] = forecast_df['forecast'] / hist_avg - 1  # uplift vs history
    forecast_results.append(forecast_df)
# Combine the per-product forecasts
all_forecasts = pd.concat(forecast_results)
all_forecasts
# ==== STEP 4. Rank future product demand ====
future_demand = (
    all_forecasts[all_forecasts['ds'] > daily_sales['ds'].max()]  # keep the pure forecast window
    .groupby('unique_id')
    .agg(Forecast_Sum=('forecast', 'sum'),             # total forecast volume
         Forecast_Growth=('forecast_change', 'mean'))  # mean expected uplift
    .reset_index()
    .sort_values('Forecast_Sum', ascending=False)
)
print("Top products by 30-day forecast demand:")
future_demand.head(10)
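mean_absolute_error is imported above but never used; a minimal holdout backtest (a sketch: the last 14 days of one sample product held out) shows how forecast quality could be checked:

# Holdout backtest for one product: train on all but the last 14 days, score with MAE
# (assumes this product has sales recorded during the final 14 days)
product = top_products[0]
series = daily_sales[daily_sales['unique_id'] == product][['ds', 'y']].sort_values('ds')
cutoff = series['ds'].max() - pd.Timedelta(days=14)
train, test = series[series['ds'] <= cutoff], series[series['ds'] > cutoff]
m = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=False)
m.fit(train)
pred = m.predict(test[['ds']])
mae = mean_absolute_error(test['y'], pred['yhat'].clip(lower=0))
print(f"{product}: 14-day holdout MAE = {mae:.2f} units/day")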
# ==== STEP 5. Plot a key product's forecast ====
# Example: the fastest-growing product
sample_product = future_demand.nlargest(1, 'Forecast_Growth')['unique_id'].iloc[0]
sample_data = all_forecasts[all_forecasts['unique_id'] == sample_product]
plt.figure(figsize=(12, 6))
plt.plot(sample_data['ds'], sample_data['forecast'], label='Forecast demand', color='#FF6B6B')
plt.fill_between(data=sample_data, x='ds', y1=0, y2='forecast', alpha=0.1, color='#FF6B6B')
# Optional: annotate the historical sales peak
plt.title(f'Product {sample_product}: Demand Forecast (trend extrapolation)')
plt.xlabel('Date')
plt.ylabel('Units')
plt.legend()
plt.grid(alpha=0.3)
plt.show()
# ==== STEP 6. (Optional) Export the forecast report ====
future_demand.to_csv('future_product_demand_forecast.csv', index=False)
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
# Data preprocessing
df = pd.read_csv('Online_Retail_Clean.csv', encoding='ISO-8859-1')
df = df[df['Quantity'] > 0]  # drop returns / negative quantities
# Top-N products per country (parameter adjustable)
def analyze_regional_preference(df, top_n=5):
    """Report each country's most popular products"""
    grouped = df.groupby(['Country', 'StockCode'])['Quantity'].sum().reset_index()
    grouped['Country'] = grouped['Country'].astype(str)  # ensure the country column is string-typed
    # Each country's top-N products with volumes
    top_products = (
        grouped.sort_values(['Country', 'Quantity'], ascending=[True, False])
        .groupby('Country')
        .head(top_n)
        .reset_index(drop=True)
    )
    # Each product's share of its country's volume
    country_total = grouped.groupby('Country')['Quantity'].sum().reset_index(name='Country_Total')
    top_products = top_products.merge(country_total, on='Country')
    top_products['Sales_Pct'] = (top_products['Quantity'] / top_products['Country_Total'] * 100).round(2)
    return top_products
# Build the country-product preference matrix
top_products_df = analyze_regional_preference(df)
top_products_df
# ==== Core analysis 1: country-specific products ====
def find_unique_preferences(df):
    """Find country-exclusive products (under 5% of volume sold elsewhere)"""
    # Global distribution per product
    global_distribution = df.groupby('StockCode')['Quantity'].sum().reset_index(name='Global_Total')
    country_distribution = df.groupby(['StockCode', 'Country'])['Quantity'].sum().reset_index()\
        .merge(global_distribution, on='StockCode')
    country_distribution['Country_Pct'] = (country_distribution['Quantity'] / country_distribution['Global_Total'] * 100).round(1)
    # Keep products dominated by a single country
    dominant_threshold = 5  # "exclusive" if other countries hold under this share
    unique_products = []
    for stock_code in country_distribution['StockCode'].unique():
        max_pct_country = country_distribution[country_distribution['StockCode'] == stock_code
                                               ].nlargest(1, 'Country_Pct')
        if max_pct_country['Country_Pct'].values[0] > (100 - dominant_threshold):
            unique_products.append({
                'StockCode': stock_code,
                'Main_Country': max_pct_country['Country'].values[0],
                'Dominance_Pct': max_pct_country['Country_Pct'].values[0]
            })
    return pd.DataFrame(unique_products).sort_values('Dominance_Pct', ascending=False)
# Output the exclusives
print("Country-exclusive products (under 5% sold elsewhere):")
find_unique_preferences(df).head()
# ==== Core visualisation ====
countries_of_interest = ['Australia', 'United Kingdom', 'Germany', 'France', 'Netherlands']
# Plot layout
fig = plt.figure(figsize=(18, 12))
gs = GridSpec(2, 3, figure=fig)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# Panel 1: Australia's top products by national share
ax1 = fig.add_subplot(gs[0, 0])
australia_top = top_products_df[top_products_df['Country'] == 'Australia'].head(5)
sns.barplot(x='Sales_Pct', y='StockCode', data=australia_top, palette='Blues_d', ax=ax1)
ax1.set_title('Australia: Top Products by National Share (%)', fontsize=12)
ax1.set_ylabel('StockCode')
# Panel 2: each country's No.1 product
ax2 = fig.add_subplot(gs[0, 1])
top1_by_country = top_products_df.groupby('Country').head(1).sort_values('Sales_Pct', ascending=False)[:10]
sns.barplot(x='Sales_Pct', y='Country', hue='StockCode', data=top1_by_country,
            dodge=False, palette='viridis', ax=ax2)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax2.set_title('Top Product per Country and Its Share', fontsize=12)
ax2.set_xlabel('Share of national volume (%)')
# Panel 3: cross-country heat map for sample products
ax3 = fig.add_subplot(gs[0, 2])
sample_products = top_products_df[top_products_df['Country'] == 'Australia']['StockCode'][:3].tolist()
heatmap_data = df[df['StockCode'].isin(sample_products)].groupby(
    ['Country', 'StockCode'])['Quantity'].sum().unstack()
sns.heatmap(heatmap_data, annot=True, fmt=".0f", cmap='YlGnBu', linewidths=.5, ax=ax3)
ax3.set_title('Cross-Country Demand for Sample Products', fontsize=12)
plt.tight_layout()
plt.show()
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import time
df = pd.read_csv('./Online_Retail_Clean.csv', encoding='ISO-8859-1')
# Initialise the geocoder (OpenStreetMap's Nominatim service)
geolocator = Nominatim(user_agent="geo_locator")
# Rate limiting (avoids being blocked for frequent requests)
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
# Country-name standardisation (example)
def standardize_country_name(country):
    country = str(country).strip()
    # Handle common aliases (extend as needed)
aliases = {
"United Kingdom": "UK",
"USA": "United States",
"UAE": "United Arab Emirates"
}
return aliases.get(country, country)
# Geocode a country to latitude/longitude
def get_lat_lng(country):
try:
location = geocode(f"{country}")
if location:
return location.latitude, location.longitude
else:
return None, None
except Exception as e:
print(f"Error for {country}: {e}")
return None, None
# Standardise country names
df['Country'] = df['Country'].apply(standardize_country_name)
# Deduplicate countries to cut API calls
unique_countries = df['Country'].unique()
# Precompute a country -> coordinates map
country_coords = {}
for country in unique_countries:
    lat, lng = get_lat_lng(country)
    country_coords[country] = (lat, lng)
    time.sleep(1)  # extra guard against rate limiting
# Attach coordinates to the original data
df['Lat'] = df['Country'].map(lambda x: country_coords.get(x, (None, None))[0])
df['Lng'] = df['Country'].map(lambda x: country_coords.get(x, (None, None))[1])
# Inspect
print("Sample rows (with coordinates):")
df[['Country', 'Lat', 'Lng']].head()
%pip install geopy -i https://pypi.tuna.tsinghua.edu.cn/simple
df
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from geopy.distance import geodesic
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
df = df.dropna(subset=['Lat', 'Lng'])
df = df[(df['Lat'].between(-90, 90)) & (df['Lng'].between(-180, 180))]  # coordinate sanity check
# Order density on a rounded lat/lng grid
df['Lat_round'] = df['Lat'].round(2)  # ~1 km grid at 0.01 degrees (adjustable)
df['Lng_round'] = df['Lng'].round(2)
order_density = df.groupby(['Lat_round', 'Lng_round']).size().reset_index(name='Orders')
order_density.head()
# ==== Warehouse siting optimisation (K-means clustering) ====
coords = df[['Lat', 'Lng']].values
# Pick the warehouse count via the elbow method
sse = []
k_range = range(3, 11)
for k in k_range:
kmeans = KMeans(n_clusters=k, random_state=42).fit(coords)
sse.append(kmeans.inertia_)
plt.figure(figsize=(10,4))
plt.plot(k_range, sse, marker='o')
plt.xlabel('Number of warehouses')
plt.ylabel('Sum of squared distances (SSE)')
plt.title('Choosing the Warehouse Count (elbow method)')
plt.grid(True)
plt.show()
# Fit with the chosen warehouse count (the elbow suggests k = 3)
best_k = 3
kmeans = KMeans(n_clusters=best_k, random_state=42).fit(coords)
df['Warehouse'] = kmeans.labels_
warehouses = pd.DataFrame(kmeans.cluster_centers_, columns=['Warehouse_Lat', 'Warehouse_Lng'])
# ==== Distances to assigned warehouses ====
def calculate_distance(row):
    """Shipping distance (km) from an order to its assigned warehouse"""
    warehouse_coord = (row['Warehouse_Lat'], row['Warehouse_Lng'])
    order_coord = (row['Lat'], row['Lng'])
    return geodesic(order_coord, warehouse_coord).km
# Attach warehouse coordinates
df_merged = df.merge(warehouses.reset_index().rename(columns={'index':'Warehouse'}),
on='Warehouse', how='left')
# Compute per-order distances
df_merged['Distance_km'] = df_merged.apply(calculate_distance, axis=1)
df_merged.head()
df_merged['InvoiceDate'] = pd.to_datetime(df_merged['InvoiceDate'])
df_merged['OrderID'] = df_merged['InvoiceDate'].dt.strftime('%Y%m%d') + '_' + df_merged['CustomerID'].astype(str)
df_merged.head()
# ==== Cost analysis and optimisation ====
# Current average shipping distance
current_avg_distance = df_merged['Distance_km'].mean()
# Simulate the effect of more warehouses (e.g. k = 7)
kmeans_new = KMeans(n_clusters=7, random_state=42).fit(coords)
df['Warehouse_New'] = kmeans_new.labels_
df_merged_new = df.merge(
pd.DataFrame(kmeans_new.cluster_centers_, columns=['Warehouse_Lat', 'Warehouse_Lng']).reset_index().rename(columns={'index':'Warehouse_New'}),
on='Warehouse_New',
how='left'
)
df_merged_new['Distance_New_km'] = df_merged_new.apply(calculate_distance, axis=1)
optimized_avg_distance = df_merged_new['Distance_New_km'].mean()
# ==== Visualisation ====
plt.figure(figsize=(15, 10))
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# Panel 1: distribution map of orders and warehouses
ax1 = plt.subplot(2, 2, 1)
sns.scatterplot(x='Lng', y='Lat', data=df, hue='Warehouse',
                palette='viridis', alpha=0.6, ax=ax1)
sns.scatterplot(x='Warehouse_Lng', y='Warehouse_Lat', data=warehouses,
                color='red', s=200, marker='X', label='Warehouse', ax=ax1)
plt.title('Order Locations and Warehouse Sites')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
# Panel 2: coverage radius per warehouse
ax2 = plt.subplot(2, 2, 2)
warehouse_coverage = df_merged.groupby('Warehouse').agg(
    Avg_Distance=('Distance_km', 'mean'),
    Total_Orders=('OrderID', 'count')
).reset_index()
sns.barplot(x='Warehouse', y='Avg_Distance', data=warehouse_coverage,
            palette='coolwarm', ax=ax2)
plt.title('Average Coverage Distance per Warehouse')
plt.ylabel('Mean distance (km)')
# Panel 3: before vs after comparison
ax3 = plt.subplot(2, 2, 3)
comparison = pd.DataFrame({
    'Scenario': ['Current (3 warehouses)', 'Optimised (7 warehouses)'],
    'Avg_Distance': [current_avg_distance, optimized_avg_distance]
})
sns.barplot(x='Scenario', y='Avg_Distance', data=comparison, palette='Set2', ax=ax3)
plt.title('Average Shipping Distance: Before vs After')
plt.ylabel('km')
# Panel 4: order-density heat map with warehouse sites
ax4 = plt.subplot(2, 2, 4)
sns.kdeplot(x=df['Lng'], y=df['Lat'], cmap="Reds", fill=True, alpha=0.6, ax=ax4)
sns.scatterplot(x='Warehouse_Lng', y='Warehouse_Lat', data=warehouses,
                color='black', s=150, marker='s', ax=ax4)
plt.title('Order Density Heat Map and Warehouse Sites')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.tight_layout()
plt.show()
After optimisation (7 vs 3 warehouses, matching the code above): the bulk of shipping distances shrinks from roughly the 0-100 range to 0-60, exposing a poor original layout and serious resource misallocation. Order density and warehouse siting are misaligned: high-density order clusters sit far from any warehouse.
Recommendations: shut down any added warehouse that improves shipping distance by less than 10% (e.g. going from 7 to 9 sites; see the sweep below). Build flagship warehouses in the top-3 order-density areas and commit to 24-hour delivery within 100 km. Withdraw service from very low-density areas (e.g. around longitude -100) to force market consolidation.
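The "under 10% improvement" rule can be grounded with a sweep over k, measuring the marginal drop in mean distance per added warehouse (a sketch reusing coords; a random subsample keeps the geodesic computation fast, since relative changes are what matter here):

# Marginal improvement per added warehouse, k = 3..9
rng = np.random.default_rng(42)
sample_idx = rng.choice(len(coords), size=min(5000, len(coords)), replace=False)
sample = coords[sample_idx]
prev = None
for k in range(3, 10):
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(sample)
    centers = km.cluster_centers_[km.labels_]
    dists = [geodesic(tuple(p), tuple(c)).km for p, c in zip(sample, centers)]
    mean_d = np.mean(dists)
    note = '' if prev is None else f" ({(prev - mean_d) / prev * 100:.1f}% better than k={k-1})"
    print(f"k={k}: mean distance {mean_d:.1f} km{note}")
    prev = mean_d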
# ==== Cost-saving estimate ====
cost_per_km = 0.2  # assumed shipping cost per km (adjust to actual business figures)
# The dataset spans roughly one year, so its total order count approximates annual volume;
# multiplying a per-order saving by 365 again would massively overstate it.
annual_orders = df_merged['OrderID'].nunique()
annual_saving = (current_avg_distance - optimized_avg_distance) * cost_per_km * annual_orders
print(f"Estimated annual cost saving: RMB {annual_saving:,.0f}")